nucs <- read.csv("EH23a.softmasked_nuccomp.csv")
nucs$GC <- rowSums( nucs[, c("C", "c", "G", "g")] )/rowSums( nucs[, c("A", "a", "C", "c", "G", "g", "T", "t")] )
nucs$GC <- nucs$GC * 100
gff <- read.table("../Figure1/EH23a.primary_high_confidence.gff3.gz", sep = "\t", quote = "\"")
#
gff[1:3, 1:8]
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 EH23a.chr1 annotation remark 1 65878799 . . .
## 2 EH23a.chr1 feature gene 11068 24494 . + .
## 3 EH23a.chr1 feature mRNA 11068 24494 . + .
genes <- gff[ gff[, 3] == "gene", ]
gff <- read.table("../Figure1/EH23a.unmasked.fasta.mod.EDTA.intact.gff3", sep = "\t", quote = "\"")
gff$chrn <- sub("^.+chr", "", gff$V1)
gff$chrn[ gff$chrn == "X" ] <- 10
gff$chrn <- as.numeric(gff$chrn)
gff$classification <- unlist(lapply(strsplit(gff[,9], split = ";"), function(x){ grep("Classification=", x, value = TRUE) }))
gff$classification <- sub("^Classification=", "", gff$classification)
gff$classification <- sub("^LTR/Copia", "Ty1", gff$classification)
gff$classification <- sub("^LTR/Gypsy", "Ty3", gff$classification)
#
gff[1:8, c(1:8, 10)]
## V1 V2 V3 V4 V5 V6 V7 V8 chrn
## 1 EH23a.chr1 EDTA repeat_region 48463 58586 . - . 1
## 2 EH23a.chr1 EDTA target_site_duplication 48463 48467 . - . 1
## 3 EH23a.chr1 EDTA long_terminal_repeat 48468 50359 . - . 1
## 4 EH23a.chr1 EDTA Copia_LTR_retrotransposon 48468 58581 . - . 1
## 5 EH23a.chr1 EDTA long_terminal_repeat 56690 58581 . - . 1
## 6 EH23a.chr1 EDTA target_site_duplication 58582 58586 . - . 1
## 7 EH23a.chr1 EDTA repeat_region 67125 71966 . + . 1
## 8 EH23a.chr1 EDTA target_site_duplication 67125 67129 . + . 1
# Ty3
Ty3 <- gff[ gff$classification == "Ty3", ]
Ty3 <- gff[ gff$V3 == "Gypsy_LTR_retrotransposon", ]
Ty3[1:8, c(1:8, 10:11)]
## V1 V2 V3 V4 V5 V6 V7 V8 chrn
## 81 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 2110985 2122540 . ? . 1
## 87 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 2180122 2191426 . + . 1
## 161 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 3614107 3626646 . - . 1
## 206 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 3905221 3916714 . ? . 1
## 228 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 4168059 4180534 . - . 1
## 242 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 4447228 4458755 . - . 1
## 283 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 4885633 4891899 . - . 1
## 305 EH23a.chr1 EDTA Gypsy_LTR_retrotransposon 5232888 5239051 . + . 1
## classification
## 81 Ty3
## 87 Ty3
## 161 Ty3
## 206 Ty3
## 228 Ty3
## 242 Ty3
## 283 Ty3
## 305 Ty3
#table(gff$classification, gff$V1)
tetbl <- table(gff$V1, gff$classification)
tetbl <- rbind(tetbl, colMeans(tetbl))
#tetbl
Ty3$motif <- sub("^motif=", "", unlist(lapply(strsplit(Ty3[, 9], split = ";"), function(x){ grep("motif", x, value = TRUE) })))
my_tbl <- table(Ty3$motif, Ty3$chrn)
#my_tbl[, 1:10]
my_mat <- as.matrix(my_tbl)
#my_mat[, 1:10]
my_mat2 <- my_mat[apply(my_mat, MARGIN = 1, function(x){ sum( x != 0 ) }) == 1, ]
my_mat2 <- my_mat2[sort.int(as.numeric(apply(my_mat2, MARGIN = 1, function(x){ paste(x, collapse = "") })),
index.return = T)$ix, ]
my_mat2
##
## 1 2 3 4 5 6 7 8 9 10
## TTTA 0 0 0 0 0 0 0 0 1 0
## TGAG 0 0 0 0 0 0 0 1 0 0
## TATC 0 0 0 0 0 0 1 0 0 0
## TATT 0 0 0 0 0 0 1 0 0 0
## TTCC 0 0 0 0 0 0 1 0 0 0
## TGTG 0 0 0 0 0 1 0 0 0 0
## TCAC 0 0 0 0 1 0 0 0 0 0
## TAGC 0 0 0 0 2 0 0 0 0 0
## TACC 0 0 0 1 0 0 0 0 0 0
## TTAG 0 0 0 1 0 0 0 0 0 0
## TTGC 0 0 0 1 0 0 0 0 0 0
## TAAT 0 0 1 0 0 0 0 0 0 0
## TCCA 0 0 1 0 0 0 0 0 0 0
## TGCT 0 0 1 0 0 0 0 0 0 0
## TAAC 0 1 0 0 0 0 0 0 0 0
## TCGT 0 1 0 0 0 0 0 0 0 0
## TCTG 0 1 0 0 0 0 0 0 0 0
## TACT 1 0 0 0 0 0 0 0 0 0
## TGTC 1 0 0 0 0 0 0 0 0 0
## TTCA 1 0 0 0 0 0 0 0 0 0
#wins <- read.csv("EH23a.softmasked_wins1e5.csv")
#
wins <- read.csv("EH23a.softmasked_wins1e6.csv")
#wins <- read.csv("EH23a.softmasked_wins1e7.csv")
wins$CGs <- 0
for( i in unique(wins$Id) ){
wins$CGs[ wins$Id == i] <- wins$CG[ wins$Id == i] - min(wins$CG[ wins$Id == i], na.rm = TRUE)
wins$CGs[ wins$Id == i] <- wins$CGs[ wins$Id == i]/max(wins$CGs[ wins$Id == i], na.rm = TRUE)
}
wins$chrn <- sub(".+chr", "", wins$Id)
wins$chrn[ wins$chrn == "X" ] <- 10
wins$chrn <- as.numeric(wins$chrn)
wins$ATs <- 1 - wins$CGs
#wins[1:3, ]
cwins <- wins[wins$CGs == 1 & !is.na(wins$CGs) , ]
cwins <- cwins[ !duplicated( cwins$Id ), ]
cwins$cent <- cwins$Start + cwins$Win_length/2
vars <- read.table("ERBxHO40vars.csv.gz", header = TRUE, sep = ",")
nrow(vars)
## [1] 450759
vars <- vars[vars$Missing == 0, ]
nrow(vars)
## [1] 35186
vars[1:3, ]
## CHROM POS n Allele_counts He Ne REFhom HET ALThom
## 48 chr_1e 62350 270 509,31 0.1082236 1.121357 240 29 1
## 102 chr_1e 98473 270 412,128 0.3617010 1.566664 145 122 3
## 220 chr_1e 204226 270 263,277 0.4996639 1.998657 65 133 72
## Missing
## 48 0
## 102 0
## 220 0
vars$ERBallele <- as.numeric(unlist(lapply(strsplit(vars$Allele_counts, split = ","), function(x){x[1]})))
vars$HO40allele <- as.numeric(unlist(lapply(strsplit(vars$Allele_counts, split = ","), function(x){x[2]})))
vars$ERBallele <- vars$ERBallele / (vars$n * 2)
vars$HO40allele <- vars$HO40allele / (vars$n * 2)
my_chrom <- "chr_1e"
plot( vars$POS[ vars$CHROM == my_chrom ], vars$ERBallele[ vars$CHROM == my_chrom ],
pch = 20, col = "#C0C0C066", ylim = c(0, 1))
points( vars$POS[ vars$CHROM == my_chrom ], vars$HO40allele[ vars$CHROM == my_chrom ],
pch = 20, col = "#4682B4")
vars$chrn <- sub("chr_", "", vars$CHROM)
vars$chrn <- sub("e$", "", vars$chrn)
vars$chrn[ vars$chrn == "X" ] <- 10
vars$chrn <- as.numeric(vars$chrn)
table(vars$chrn)
##
## 1 2 3 4 5 6 7 8 9 10
## 2933 3228 4869 5716 3184 2886 2317 1975 4690 3388
library(ggplot2)
ggplot(vars, aes(x=chrn, y=POS, group=CHROM)) +
geom_point()
my_cols1 <- paste(RColorBrewer::brewer.pal( n = 12, name = "Set3"), "08", sep="")
#my_cols1 <- paste(RColorBrewer::brewer.pal( n = 12, name = "Paired"), "08", sep="")
hist(vars$ERBallele[vars$chrn == 2], xlim = c(0, 1))
hist(vars$HO40allele[vars$chrn == 2], xlim = c(0, 1))
p <- ggplot(vars, aes(x=chrn+ERBallele-0.5, y=POS, group=CHROM)) +
geom_point( aes( color = CHROM ) )
p <- p + scale_color_manual(values=my_cols1)
#p <- p + geom_point( data = vars, aes(x = chrn-HO40allele, y=POS), color = vars$chrn)
#
p <- p + geom_point( data = vars, aes(x = chrn+HO40allele-0.5, y=POS), color = my_cols1[vars$chrn])
#p <- p + geom_point( data = vars, aes(x = chrn-0, y=POS), color = my_cols1[vars$chrn])
#p <- p + scale_color_manual(values=my_cols1)
p <- p + theme_bw()
p <- p + theme(legend.position='none')
p <- p + scale_x_continuous(breaks=seq(1, 10, 1))
p <- p + xlab("Chromosome")
p
#p + scale_color_brewer(palette="Dark2")
#p + scale_color_brewer(palette="Set3")
# geom_point( aes( color = CHROM, alpha = 1/10 ) )
# geom_point( aes( alpha = 1/10 ) )
# wins[1:3, ]
# genes[1:3, 1:8]
wins$gcnt <- 0
wins$ty1cnt <- 0
wins$ty3cnt <- 0
for(i in 1:nrow(wins)){
tmp <- genes[genes$V1 == wins$Id[i] & genes$V4 >= wins$Start[i] & genes$V5 < wins$End[i], ]
wins$gcnt[i] <- nrow(tmp)
tmp <- gff[gff$V1 == wins$Id[i] & gff$V4 >= wins$Start[i] & gff$V5 < wins$End[i], ]
wins$ty1cnt[i] <- sum(tmp$classification == "Ty1")
wins$ty3cnt[i] <- sum(tmp$classification == "Ty3")
}
wins$gcntsc <- wins$gcnt - min(wins$gcnt)
wins$gcntsc <- wins$gcntsc / max(wins$gcntsc)
#range(round( wins$gcntsc * 100 ))
my_index <- round( wins$gcntsc * 100 )
#my_index <- 100 - my_index
my_index[ my_index <= 0] <- 1
#wins$gcol <- heat.colors(n=100)[ my_index ]
#wins$gcol <- colorRampPalette(c("yellow", "orange", "red"))( 100 )[ my_index ]
#
wins$gcol <- colorRampPalette(c("red", "orange", "yellow"))( 100 )[ my_index ]
#wins$gcol <- colorRampPalette(c("#0000FF", "#228B22", "#A0522D"))( 100 )[ my_index ]
#wins$gcol <- colorRampPalette(c("#87CEEB", "#3CB371", "#228B22", "#A0522D"))( 100 )[ my_index ]
wins$chr <- sub("^.+chr", "chr", wins$Id)
#
wins[1:3, ]
## Id Win_number Start End Win_length A C G T
## 1 EH23a.chr1 0 0 999999 1000000 339510 159349 157789 343352
## 2 EH23a.chr1 1 1000000 1999999 1000000 346184 152100 152139 349577
## 3 EH23a.chr1 2 2000000 2999999 1000000 343278 155236 154692 346794
## CG CHG CHH CGs chrn ATs gcnt ty1cnt ty3cnt gcntsc
## 1 14003 19695 90544 0.07933312 1 0.9206669 156 24 0 1.0000000
## 2 13018 18441 88693 0.00000000 1 1.0000000 141 24 0 0.9038462
## 3 14100 19606 89229 0.08714562 1 0.9128544 123 18 12 0.7884615
## gcol chr
## 1 #FFFF00 chr1
## 2 #FFEC00 chr1
## 3 #FFD800 chr1
# Cannabinoid BLAST results.
cann_blst <- read.csv("cann_3dom_EH23a_tblastn.csv", header = FALSE)
colnames(cann_blst) <- c('qseqid','qlen','sseqid','slen','qstart',
'qend','sstart','send','evalue','bitscore',
'score','length','pident','nident','mismatch',
'positive','gapopen','gaps','ppos','sstrand',
'sseq')
#
cann_blst <- cann_blst[grep("SignalP|FAD|BBE", cann_blst$qseqid, invert = TRUE, value = F), ]
cann_blst$name <- sub("^.+_", "", cann_blst$qseqid)
# table(cann_blst$qlen)
cann_blst <- cann_blst[ cann_blst$gaps <= 0, ]
cann_blst <- cann_blst[ cann_blst$mismatch <= 10, ]
# write.table(cann_blst, file = "cann_3dom_EH23a_tblastn_filter.csv",
# sep = ",", row.names = FALSE, col.names = FALSE, quote = FALSE)
# system("~/gits/hempy/bin/blast_to_gff.py cann_3dom_EH23a_tblastn_filter.csv")
cann_blst$chrn <- as.numeric(sub(".+chr", "", cann_blst$sseqid))
#table(cann_blst$qseqid)
# cann_blst[1:3, 1:10]
# cann_blst[1:3, 10:20]
knitr::kable(cann_blst[, c(1, 2, 3, 7, 13:15, 18, 23)], caption = "**Table 1. Cannabinoid synthase genes.**")
| qseqid | qlen | sseqid | sstart | pident | nident | mismatch | gaps | chrn | |
|---|---|---|---|---|---|---|---|---|---|
| 1 | AB212829_THCAS1 | 545 | EH23a.chr7 | 11346499 | 99.817 | 544 | 1 | 0 | 7 |
| 130 | AB212830_THCAS2 | 545 | EH23a.chr7 | 30706708 | 99.633 | 543 | 2 | 0 | 7 |
| 131 | AB212830_THCAS2 | 545 | EH23a.chr7 | 30774874 | 99.450 | 542 | 3 | 0 | 7 |
| 132 | AB212830_THCAS2 | 545 | EH23a.chr7 | 30824591 | 99.266 | 541 | 4 | 0 | 7 |
| 374 | AB292683_CBDAS2 | 546 | EH23a.chr7 | 12111396 | 98.716 | 538 | 7 | 0 | 7 |
| 504 | AB292684_CBDAS3 | 546 | EH23a.chr7 | 12111396 | 98.165 | 535 | 10 | 0 | 7 |
| 634 | LY658671.1_CBCAS1 | 545 | EH23a.chr7 | 30706708 | 99.817 | 544 | 1 | 0 | 7 |
| 635 | LY658671.1_CBCAS1 | 545 | EH23a.chr7 | 30774874 | 99.633 | 543 | 2 | 0 | 7 |
| 636 | LY658671.1_CBCAS1 | 545 | EH23a.chr7 | 30824591 | 99.450 | 542 | 3 | 0 | 7 |
#nrow(cann_blst)
blst <- read.csv("EH23a_blastn.csv", header = FALSE)
colnames(blst) <- c('qseqid','qlen','sseqid','slen','qstart',
'qend','sstart','send','evalue','bitscore',
'score','length','pident','nident','mismatch',
'positive','gapopen','gaps','ppos','sstrand',
#'staxids','sblastnames','salltitles',
'sseq')#[1:21]
#blst$sseqid <- factor(blst$sseqid, levels = c("EH23a.chr1", "EH23a.chr2", "EH23a.chr3", "EH23a.chr4", "EH23a.chr5", "EH23a.chr6", "EH23a.chr7", "EH23a.chr8", "EH23a.chr9", "EH23a.chrX"))
#blst <- blst[blst$qseqid == "CsatSD_centromere_370bp", ]
#table(blst$qseqid)
# blst[1:3, 1:10]
blst$chrn <- sub(".+chr", "", blst$sseqid)
blst$chrn[ blst$chrn == "X" ] <- 10
blst$chrn <- as.numeric(blst$chrn)
subt <- blst[grep("CsatSD_centromere_370bp", blst$qseqid), ]
cent <- blst[grep("CsatSD_centromere_237bp", blst$qseqid), ]
#blst
#knitr::kable(blst[1:3, c(1, 2, 3, 7, 13:15, 18, 22)], caption = "**Table X. Blast.**")
knitr::kable(subt[1:3, c(1, 2, 3, 7, 13:15, 18, 22)], caption = "**Table X. Blast.**")
| qseqid | qlen | sseqid | sstart | pident | nident | mismatch | gaps | chrn |
|---|---|---|---|---|---|---|---|---|
| CsatSD_centromere_370bp | 370 | EH23a.chrX | 34848 | 100.00 | 370 | 0 | 0 | 10 |
| CsatSD_centromere_370bp | 370 | EH23a.chrX | 817 | 99.73 | 369 | 1 | 0 | 10 |
| CsatSD_centromere_370bp | 370 | EH23a.chrX | 6368 | 99.73 | 369 | 1 | 0 | 10 |
plot_ideo <- function() {
suppressPackageStartupMessages(require(ggplot2))
# marker_df <- data.frame(
# chrom = rep(names(map), times = lapply(map, length)),
# pos = unlist(map),
# marker = names(unlist(map))
# )
# marker_df$chromf <- factor( marker_df$chrom, levels = names(map) )
# marker_df$chromn <- as.numeric( marker_df$chromf )
# chr_df <- data.frame(
# start = unlist(lapply(map, min)),
# end = unlist(lapply(map, max))
# )
# chr_df$chr <- names(map)
# chr_df$chrf <- factor( chr_df$chr, levels = names(map))
# chr_df$chrn <- as.numeric( chr_df$chrf )
chr_df <- data.frame(
start = 1,
end = nucs$Length,
chr = nucs$Id
)
chr_df$chrn <- sub(".+chr", "", nucs$Id)
chr_df$chrn[chr_df$chrn == "X"] <- 10
chr_df$chrn <- as.numeric(chr_df$chrn)
chrom_wid <- 0.02
p <- ggplot2::ggplot()
p <- p + ggplot2::geom_rect( data = chr_df,
ggplot2::aes( xmin = chrn - chrom_wid,
xmax = chrn + chrom_wid,
#xmin = as.numeric(as.factor(chr)) - chrom_wid,
#xmax = as.numeric(as.factor(chr)) + chrom_wid,
ymin = end, ymax = start),
#fill = "#C0C0C0",
fill = "#DCDCDC",
#fill = "#F5F5F5",
color = "#000000"
)
#p <- p + scale_y_reverse(limits = c(max_gd, 0))
#wins$Id
thinw <- 0.28
p <- p + ggplot2::geom_rect(
data = wins,
ggplot2::aes(
# xmin = chrn - CGs,
# xmax = chrn + CGs,
xmin = chrn - ATs * thinw,
xmax = chrn + ATs * thinw,
ymin = Start,
ymax = End),
fill = wins$gcol,
#fill = "#0000CD",
#fill = "#A9A9A9",
#fill = "#C0C0C0",
#fill = "#DCDCDC",
#fill = "#F5F5F5",
# color = "#000000"
color = NA
)
#p
cmwidth <- 0.4
p <- p + ggplot2::geom_rect(
data = cann_blst,
ggplot2::aes(
xmin = chrn - cmwidth,
xmax = chrn + cmwidth,
ymin = sstart,
ymax = send),
#fill = "#0000CD",
#fill = "#A9A9A9",
fill = "#C0C0C0",
#fill = "#DCDCDC",
#fill = "#F5F5F5",
#color = "#000000"
color = "#228B22"
#color = NA
)
#p
p <- p + annotate(geom="text", x=7.5, y=11.3e6, label="THCAS1",
color="#228B22", size = 3)
p <- p + annotate(geom="text", x=7.5, y=12.1e6, label="CBDAS2",
color="#0000FF", size = 3)
p <- p + annotate(geom="text", x=7.5, y=30.7e6, label="CBCAS",
color="#B22222", bg = "#ffffff", size = 3)
#p
stwidth <- 0.40
p <- p + ggplot2::geom_rect(
data = subt,
ggplot2::aes(
xmin = chrn - stwidth,
xmax = chrn + stwidth,
#xmax = chrn,
ymin = sstart,
ymax = send),
#fill = "#0000CD",
#fill = "#A9A9A9",
fill = "#C0C0C0",
#fill = "#DCDCDC",
#fill = "#F5F5F5",
#color = "#000000"
color = "#0000FF"
#color = "#228B22"
#color = NA
)
#p
mwidth <- 0.6
p <- p + ggplot2::geom_rect(
data = cent,
ggplot2::aes(
xmin = chrn - mwidth,
# xmax = chrn + mwidth,
xmax = chrn - 0.2,
ymin = sstart,
ymax = send),
#fill = "#0000CD",
#fill = "#A9A9A9",
fill = "#C0C0C0",
#fill = "#DCDCDC",
#fill = "#F5F5F5",
color = "#000000"
#color = "#0000FF"
#color = "#8A2BE2"
#color = "#228B22"
#color = NA
)
#p
# mwidth <- 0.4
# p <- p + ggplot2::geom_rect(
# data = Ty3,
# ggplot2::aes(
# xmin = chrn - 0,
# # xmin = chrn - mwidth,
# xmax = chrn + mwidth,
# # xmax = chrn,
# ymin = V4,
# ymax = V5),
# fill = "#0000FF",
# color = NA
# )
# #p
# marker_wid <- 0.1
# marker_high <- 0.4
# p <- p + ggplot2::geom_rect( data = marker_df,
# ggplot2::aes( xmin = chromn - marker_wid,
# xmax = chromn + marker_wid,
# #xmin = as.numeric(as.factor(chrom)) - marker_wid,
# #xmax = as.numeric(as.factor(chrom)) + marker_wid,
# ymin = pos - marker_high, ymax = pos + marker_high),
# fill = "#228B22", color = "#228B22"
# )
#
# p <- p + scale_y_reverse( breaks = seq(0, 2e3, by = 100) )
#p <- p + ggplot2::scale_y_reverse( minor_breaks = seq(0, 2e3, by = 20), breaks = seq(0, 2e3, by = 100) )
#p <- p + scale_x_continuous( breaks = as.numeric(as.factor(chr_df$chr) ) )
p <- p + ggplot2::scale_x_continuous( breaks = chr_df$chrn, labels = chr_df$chr )
# p <- p + scale_y_reverse(limits = c(max_gd, 0))
# p <- p + scale_y_reverse(limits = c(0, max_gd))
#p <- p + theme_bw() + theme( panel.grid.minor = element_blank() )
p <- p + ggplot2::theme_bw() +
ggplot2::theme( panel.grid.minor.x = ggplot2::element_blank(),
axis.text.x = element_text(angle = 60, hjust = 1),
panel.grid.major.y = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 1 ),
panel.grid.minor.y = ggplot2::element_line( linewidth = 0.4, color = "#C0C0C0", linetype = 3 )
#panel.grid.major.y = ggplot2::element_line( size = 0.4, color = "#C0C0C0", linetype = 1 ),
#panel.grid.minor.y = ggplot2::element_line( size = 0.4, color = "#C0C0C0", linetype = 3 )
)
# p <- p + ggplot2::xlab("Chromosome")
#p <- p + ylab("Distance (cM)")
#p <- p + ggplot2::ylab("Location (cM)")
p <- p + ggplot2::ylab("Position (bp)")
#p
#p
#return( invisible( NULL ) )
p
}
p1 <- plot_ideo()
p1
my_data <- readr::read_csv("gene_counts.csv", )
## Rows: 69 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Sample
## dbl (11): chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX, chrY
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# names(my_data)[1] <- "Sample"
# my_data$Sample <- as.factor( my_data$Sample )
my_data
## # A tibble: 69 × 12
## Sample chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chrX chrY
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AH3Ma 4052 2763 2586 3189 2424 2535 2058 2892 2610 3689 NA
## 2 AH3Mb 4062 2807 2620 3224 2563 2604 2080 2967 2509 NA 2850
## 3 BCMa 4137 2878 2682 3256 2597 2599 2031 2953 2654 3792 NA
## 4 BCMb 4059 2703 2611 3098 2601 2615 1986 2905 2569 NA 2751
## 5 BOAXa 4097 2877 2680 3147 2444 2718 2015 2950 2612 3778 NA
## 6 BOAXb 4039 2816 2507 3159 2499 2586 1995 2914 2551 3670 NA
## 7 COFBa 4365 2681 2552 3035 2404 2457 1793 2840 2483 3625 NA
## 8 COFBb 4078 2960 2755 3300 2768 2750 2212 3058 2769 3944 NA
## 9 COSVa 4063 2834 2677 3130 2595 2580 2048 2940 2594 3722 NA
## 10 COSVb 4049 2846 2690 3175 2601 2645 2029 2977 2566 3801 NA
## # … with 59 more rows
library(tidyr)
data_long <- my_data %>%
pivot_longer( cols = !Sample, names_to = "Chrom", values_to = "Count")
#data_long$Length <- data_long$Length / 1e6
gcount <- data_long
sum(is.na(data_long$Count))
## [1] 69
data_long <- data_long[!is.na(data_long$Count), ]
data_long
## # A tibble: 690 × 3
## Sample Chrom Count
## <chr> <chr> <dbl>
## 1 AH3Ma chr1 4052
## 2 AH3Ma chr2 2763
## 3 AH3Ma chr3 2586
## 4 AH3Ma chr4 3189
## 5 AH3Ma chr5 2424
## 6 AH3Ma chr6 2535
## 7 AH3Ma chr7 2058
## 8 AH3Ma chr8 2892
## 9 AH3Ma chr9 2610
## 10 AH3Ma chrX 3689
## # … with 680 more rows
my_data <- readr::read_csv("chrom_lengths.csv")
## New names:
## Rows: 69 Columns: 12
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (1): ...1 dbl (11): chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX,
## chrY
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
names(my_data)[1] <- "Sample"
my_data$Sample <- as.factor( my_data$Sample )
my_data
## # A tibble: 69 × 12
## Sample chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chrX
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AH3Ma 6.57e7 7.65e7 8.20e7 8.16e7 7.65e7 7.63e7 6.64e7 5.41e7 6.60e7 8.42e7
## 2 AH3Mb 6.63e7 7.62e7 8.11e7 8.09e7 7.80e7 7.64e7 6.65e7 5.52e7 6.50e7 NA
## 3 BCMa 6.51e7 7.58e7 8.62e7 8.12e7 7.67e7 7.53e7 6.55e7 5.48e7 6.49e7 8.33e7
## 4 BCMb 6.32e7 7.83e7 8.16e7 7.84e7 7.69e7 7.68e7 6.44e7 5.42e7 6.55e7 NA
## 5 BOAXa 6.53e7 7.96e7 8.18e7 7.80e7 7.64e7 7.91e7 6.40e7 5.48e7 6.53e7 8.32e7
## 6 BOAXb 6.50e7 7.76e7 7.95e7 7.97e7 7.57e7 7.58e7 6.49e7 5.44e7 6.60e7 8.26e7
## 7 COFBa 7.21e7 7.85e7 8.44e7 8.21e7 7.76e7 7.97e7 6.37e7 5.57e7 6.55e7 8.57e7
## 8 COFBb 6.47e7 7.71e7 8.21e7 8.16e7 7.77e7 7.71e7 6.84e7 5.52e7 6.63e7 8.46e7
## 9 COSVa 6.76e7 8.34e7 8.63e7 8.52e7 8.09e7 7.93e7 6.99e7 5.53e7 6.81e7 8.65e7
## 10 COSVb 6.61e7 8.15e7 8.74e7 8.40e7 8.46e7 8.30e7 6.50e7 5.75e7 6.78e7 8.58e7
## # … with 59 more rows, and 1 more variable: chrY <dbl>
library(tidyr)
data_long <- my_data %>%
pivot_longer( cols = !Sample, names_to = "Chrom", values_to = "Length")
data_long$Length <- data_long$Length / 1e6
data_long
## # A tibble: 759 × 3
## Sample Chrom Length
## <fct> <chr> <dbl>
## 1 AH3Ma chr1 65.7
## 2 AH3Ma chr2 76.5
## 3 AH3Ma chr3 82.0
## 4 AH3Ma chr4 81.6
## 5 AH3Ma chr5 76.5
## 6 AH3Ma chr6 76.3
## 7 AH3Ma chr7 66.4
## 8 AH3Ma chr8 54.1
## 9 AH3Ma chr9 66.0
## 10 AH3Ma chrX 84.2
## # … with 749 more rows
glength <- data_long
glength
## # A tibble: 759 × 3
## Sample Chrom Length
## <fct> <chr> <dbl>
## 1 AH3Ma chr1 65.7
## 2 AH3Ma chr2 76.5
## 3 AH3Ma chr3 82.0
## 4 AH3Ma chr4 81.6
## 5 AH3Ma chr5 76.5
## 6 AH3Ma chr6 76.3
## 7 AH3Ma chr7 66.4
## 8 AH3Ma chr8 54.1
## 9 AH3Ma chr9 66.0
## 10 AH3Ma chrX 84.2
## # … with 749 more rows
gcount
## # A tibble: 759 × 3
## Sample Chrom Count
## <chr> <chr> <dbl>
## 1 AH3Ma chr1 4052
## 2 AH3Ma chr2 2763
## 3 AH3Ma chr3 2586
## 4 AH3Ma chr4 3189
## 5 AH3Ma chr5 2424
## 6 AH3Ma chr6 2535
## 7 AH3Ma chr7 2058
## 8 AH3Ma chr8 2892
## 9 AH3Ma chr9 2610
## 10 AH3Ma chrX 3689
## # … with 749 more rows
data_long <- cbind(glength, gcount$Count)
names(data_long)[4] <- "Count"
data_long[1:3, ]
## Sample Chrom Length Count
## 1 AH3Ma chr1 65.67137 4052
## 2 AH3Ma chr2 76.48150 2763
## 3 AH3Ma chr3 81.99513 2586
my_pal <- RColorBrewer::brewer.pal(n=12, name = "Paired")
#my_pal[11] <- "#B15928"
my_pal[11] <- "#C71585"
my_pal <- paste(my_pal, "88", sep = "")
library(ggplot2)
# Basic scatter plot
p <- ggplot(data_long, aes(x=Length, y=Count, color=Chrom))
p <- p + geom_point(size=2)
#p <- p + geom_smooth(method=lm)
p <- p + geom_smooth(method=lm, se=FALSE, linewidth = 1)
p <- p + theme_bw()
p <- p + theme(legend.title = element_blank())
#p <- p + theme(legend.position = "left")
p <- p + theme(legend.position = "right")
#p <- p + theme( legend.spacing.y = unit(1.0, 'mm') )
## important additional element
#p <- p + guides(fill = guide_legend(byrow = TRUE))
p + theme(legend.spacing.y = unit(1.0, 'mm')) +
guides(fill = guide_legend(byrow = TRUE))
## `geom_smooth()` using formula = 'y ~ x'
#p <- p + scale_color_brewer(palette="Dark2")
#p <- p + scale_color_brewer(palette="Paired")
p <- p + scale_color_manual(values=my_pal)
#p <- p + scale_color_brewer(palette="Set3")
p <- p + xlab("Chromosome length (Mbp)")
p <- p + ylab("Genes per chromosome")
p2 <- p
p2
## `geom_smooth()` using formula = 'y ~ x'
#plot(wins$gcnt, wins$ty3cnt)
#plot(wins$gcnt, wins$ty1cnt)
wins[1:3, ]
## Id Win_number Start End Win_length A C G T
## 1 EH23a.chr1 0 0 999999 1000000 339510 159349 157789 343352
## 2 EH23a.chr1 1 1000000 1999999 1000000 346184 152100 152139 349577
## 3 EH23a.chr1 2 2000000 2999999 1000000 343278 155236 154692 346794
## CG CHG CHH CGs chrn ATs gcnt ty1cnt ty3cnt gcntsc
## 1 14003 19695 90544 0.07933312 1 0.9206669 156 24 0 1.0000000
## 2 13018 18441 88693 0.00000000 1 1.0000000 141 24 0 0.9038462
## 3 14100 19606 89229 0.08714562 1 0.9128544 123 18 12 0.7884615
## gcol chr
## 1 #FFFF00 chr1
## 2 #FFEC00 chr1
## 3 #FFD800 chr1
my_pal <- RColorBrewer::brewer.pal(n=12, name = "Paired")
#my_pal[11] <- "#B15928"
my_pal[11] <- "#C71585"
my_pal <- paste(my_pal, "66", sep = "")
library(ggplot2)
# Basic scatter plot
#p <- ggplot(wins, aes(x=gcnt, y=ty3cnt, color=Id))
#p <- ggplot(wins, aes(x=gcnt, y=ty3cnt, color=chr))
p <- ggplot(wins, aes(x=gcnt, y=ty3cnt))
#p <- p + geom_point(size=2, color = "#C0C0C0")
#p <- p + geom_point(size=2, color = "#77889966")
#
p <- p + geom_point(size=2, color = "#70809044")
#p <- p + geom_point(size=2)
#p <- p + geom_smooth(method=lm)
p <- p + geom_smooth(method=lm, se=FALSE, linewidth = 2)
p <- p + geom_text(x=100, y=90, label="y = (-0.4)x + 56.7", size = 4)
p <- p + theme_bw()
p <- p + theme(legend.position = "none")
#p <- p + theme(legend.title = element_blank())
#p <- p + scale_color_brewer(palette="Dark2")
#p <- p + scale_color_brewer(palette="Paired")
p <- p + scale_color_manual(values=my_pal)
#p <- p + scale_color_brewer(palette="Set3")
p <- p + xlab("Genes per window")
p <- p + ylab("Ty3 elements per window")
p3 <- p
p3
## `geom_smooth()` using formula = 'y ~ x'
lm1 <- lm( wins$ty3cnt ~ wins$gcnt )
summary(lm1)
##
## Call:
## lm(formula = wins$ty3cnt ~ wins$gcnt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.700 -11.859 0.224 10.512 64.935
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.70020 1.09209 51.92 <2e-16 ***
## wins$gcnt -0.40185 0.02219 -18.11 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.43 on 742 degrees of freedom
## Multiple R-squared: 0.3065, Adjusted R-squared: 0.3056
## F-statistic: 327.9 on 1 and 742 DF, p-value: < 2.2e-16
lm2 <- lm( wins$ty1cnt ~ wins$gcnt )
summary(lm2)
##
## Call:
## lm(formula = wins$ty1cnt ~ wins$gcnt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34.909 -13.025 -1.495 9.910 62.720
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.71174 1.02287 30.025 < 2e-16 ***
## wins$gcnt 0.06265 0.02078 3.014 0.00266 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.32 on 742 degrees of freedom
## Multiple R-squared: 0.0121, Adjusted R-squared: 0.01077
## F-statistic: 9.087 on 1 and 742 DF, p-value: 0.002663
#library(magick)
#ggdraw() +
# cowplot::draw_image("https://i.stack.imgur.com/WDOo4.jpg?s=328&g=1") #+
# draw_plot(my_plot)
library("ggpubr")
ggarrange(
plotlist = list(p1,
ggarrange(p3, p2,
ncol = 2, nrow = 1,
widths = c(2, 3),
labels = c("B", "C"))
),
labels = c("A", ""),
ncol = 1, nrow = 2,
widths = 1, heights = c(2, 1))
Figure 1. Genomic architecture of the Cannabis sativa genome.
Figure 1. Genomic architecture of the Cannabis sativa genome. (A) Ideogram of the ERBxHO40_23 haplotype A genome. Each chromosome is divided into 1 Mbp windows where each window’s width is proportional to the inverse abundance of the motif ‘CG’ and each window is colored according to gene density. Windows with high ‘CG’ abundance are narrow and windows with high gene counts are yellow. The plant ERBxHO40_23 resulted from a cross between strains ERB (chemotype III or CBD dominant) and HO40 (chemotype I or THC dominant). The genetically female plant HO40 (XX) was chemically masculinized to produce pollen flowers to facilitate the cross. Subtelomeric repeat motifs are marked with blue horizontal lines. Cannabinoid synthase genes are marked with green horizontal lines on chromosome seven. (B) The abundances of genes and Ty3 LTRs are inversely related. Each dot represents a 1 Mbp window from the ERBxHO40_23 haplotype A assembly, organized by the number of genes and Ty3 LTR elements contained in each window. Windows with high quantities of genes have low Ty3 LTR counts. The genes in ERBxHO40 are predominantly near the ends of each chromosome, while the central portion of each chromosome appears populated by Ty3 LTR elements. (C) Long chromosomes have more genes, however some chromosomes demonstrate exceptional gene content. Chromosome Y is the longest chromosome but includes a number of genes that is comparable to other chromosomes. Each dot represents a chromosome from phased and assembled haplotypes (n = 69; X = 65; Y = 4). Chromosome 1 is of average length but contains more genes than other chromosomes of a similar length. Chromosomes X and 8 also have more genes than chromosomes of similar length. Chromosome 7, where the cannabinoid synthases (CBCAS, CBDAS, THCAS) are found, has the lowest number of genes.